The purpose of this file is to process the combined data files for Summer 2022 into files that contain only valid data for analysis, excluding invalid sessions and participants.

Data is imported from 2 files, indicating two levels of analysis: participants and blocks (item-level).

Note: mouse-cursor data contained in final_mouse_blocks.json file is not handled here.

# IMPORT DATA
# Read the participant-level and item-level JSON exports
df_participants <- fromJSON("input/su22_sgc4d_final_participants.json")
df_items <- fromJSON("input/su22_sgc4d_final_items.json")

# Tag both tables with the data-collection term
df_participants[["term"]] <- "summer22"
df_items[["term"]] <- "summer22"

# DEFINE SGC_4D validity criteria
# These constants drive every exclusion step further down in this file.
sessions <- c("suPROLIFIC") # SGC4D running on prolific
# Numeric codes; they are matched with %in% against the condition column
conditions <- c(11111112, 11311112) # 2 conditions
violation_threshold <- 4 # number of allowable browser violations
effort_exclusion <- c(
  "I didn't try very hard, or rushed through the questions",
  "I started out trying hard, but gave up at some point"
)
n_items <- 15 # fifteen items is complete dataset per participant

# Placeholder that accumulates excluded participants (one rbind per exclusion step)
ex_participants <- data.frame()

Note: We drop all scores calculated in the stimulus engine (except the absolute score, which is a simple count of strictly correct responses), as they are recalculated during analysis using a different MC scoring algorithm.

# Create factors in PARTICIPANTS, then keep and order the analysis columns.
# Only columns named in select() survive, so mutate() derives nothing that
# select() would immediately drop (the former `year` and `native_language`
# factors were never selected; `schoolyear` and `language` are kept as-is).
df_participants <- df_participants %>%
  mutate(
    subject = as.character(subject),
    condition = as.character(condition),
    # human-readable condition label, derived from the character condition code
    pretty_condition = recode_factor(
      condition,
      "11111112" = "ORTH-equilateral",
      "11311112" = "TRI-equilateral"
    ),
    study = factor(study),
    session = factor(session),
    exp_id = factor(exp_id),
    sona_id = as.character(sona_id),
    pool = factor(pool),
    mode = factor(mode),
    attn_check = factor(attn_check),
    status = factor(status),
    term = factor(term),
    gender = as.factor(gender),
    age = as.integer(age),
    country = gsub('"', "", country), # remove extraneous double quotes
    major = factor(major),
    browser = factor(browser),
    os = factor(os),
    totaltime_m = totaltime / 1000 / 60 # divide by 1000, then by 60
  ) %>%
  select( # order cols
    subject,
    study,
    condition,
    pretty_condition,
    session,
    exp_id,
    sona_id,
    pool,
    mode,
    attn_check,
    # explanation,
    effort,
    difficulty,
    confidence,
    enjoyment,
    other,
    age,
    country,
    language,
    schoolyear,
    major,
    gender,
    disability,
    browser,
    width,
    height,
    os,
    starttime,
    status,
    term,
    violations,
    absolute_score,
    # Engine-computed scores below are intentionally not kept:
    # discriminant_score,
    # tri_score,
    # orth_score,
    # other_score,
    # blank_score,
    totaltime_m
  )

# NOTE THAT WE DROP ALL SCORES, WHICH ARE INCORRECTLY CALCULATED IN THE
# stimulus engine. We do not drop the raw responses (answers).

# Step 1: derive factors and rescaled timing columns
# (subject, condition, explicit, impasse, grid, mark and ixn are deliberately
# left unconverted)
df_items <- df_items %>%
  mutate(
    pretty_condition = recode_factor(
      condition,
      "11111112" = "ORTH-equilateral",
      "11311112" = "TRI-equilateral"
    ),
    pool = factor(pool),
    mode = factor(mode),
    term = factor(term),
    relation = factor(relation),
    block = factor(block),
    correct = factor(correct),
    q = factor(q),
    rt_s = rt / 1000,
    time_elapsed_m = time_elapsed / 1000 / 60
  )

# Step 2: keep and order the analysis columns; the engine scores
# (discriminant, tri_score, orth_score, other_score, blank_score) are not
# selected BECAUSE THEY ARE RESCORED IN THE ANALYSIS FILE
df_items <- df_items %>%
  select(
    subject, study, term, pool, mode,
    condition, pretty_condition, block,
    explicit, impasse, grid, mark, ixn,
    gwidth, gheight, graph,
    time_elapsed_m,
    question, relation, q,
    correct, answer,
    rt_s
  )

1 Data Validation

1.1 Exclusions

1.1.1 Completion Status

Starting with Winter 2022, data are saved to the database even if the subject’s browser did not meet minimum specifications (at which point they are prompted to change browsers, or end the study). This allows us to learn about the browsers, screen sizes and OS that (potential) subjects are using. However, these data are not exported from the database for analysis (see flatten.js and status.js scripts). Thus, only subjects who successfully completed the entire study are included in this file.

# MANUALLY INSPECT status: participant count per completion status
df_participants %>%
  group_by(status) %>%
  dplyr::summarize(n = n())
## # A tibble: 1 × 2
##   status      n
##   <fct>   <int>
## 1 success   122

122 participants successfully completed the study.

# DISCARD participants whose completion status is not "success"
# A missing status is treated as not successful: a bare `status != "success"`
# evaluates to NA for such rows and filter() would silently keep them in the
# analysis set without recording them as excluded.
exclude_status <- df_participants %>%
  filter(is.na(status) | status != "success") %>%
  mutate(reason = "invalid-status")

# Record the exclusions, then discard the temporary table
ex_participants <- rbind(ex_participants, exclude_status)
rm(exclude_status)

# Keep only participants that have not been excluded so far
df_participants <- df_participants %>%
  filter(!subject %in% ex_participants$subject)

No data need to be excluded on account of completion status.

1.1.2 Conditions

Participants are randomly assigned to an experimental condition when starting the study. Here we validate that only conditions for the current study are included in this dataset.

# MANUALLY INSPECT conditions: participant count per assigned condition
df_participants %>%
  group_by(condition) %>%
  dplyr::summarize(n = n())
## # A tibble: 2 × 2
##   condition     n
##   <chr>     <int>
## 1 11111112     60
## 2 11311112     62

Data from conditions not corresponding to valid conditions should be discarded.

# DISCARD participants assigned to a condition that is not valid for this study
exclude_condition <- df_participants %>%
  filter(!(condition %in% conditions)) %>%
  mutate(reason = "invalid-condition")

# Record the exclusions, then discard the temporary table
ex_participants <- rbind(ex_participants, exclude_condition)
rm(exclude_condition)

# Keep only participants that have not been excluded so far
df_participants <- filter(
  df_participants,
  !(subject %in% ex_participants$subject)
)

No data need to be excluded on account of condition.

1.1.3 Sessions

The (string) session code is embedded in the URL querystring by the experimenter to differentiate testing sessions in SONA from demo and other environment setup tasks.

# MANUALLY INSPECT sessions: participant count per session code
df_participants %>%
  group_by(session) %>%
  dplyr::summarize(n = n())
## # A tibble: 1 × 2
##   session        n
##   <fct>      <int>
## 1 suPROLIFIC   122

Data from sessions not corresponding to valid sessions should be discarded.

# DISCARD participants whose session code is not in the valid sessions list
exclude_session <- df_participants %>%
  filter(!(session %in% sessions)) %>%
  mutate(reason = "invalid-session")

# Record the exclusions, then discard the temporary table
ex_participants <- rbind(ex_participants, exclude_session)
rm(exclude_session)

# Keep only participants that have not been excluded so far
df_participants <- filter(
  df_participants,
  !(subject %in% ex_participants$subject)
)

No participants are excluded on account of session (ie. app testing or pilot session).

1.1.4 Browser Interaction Violations

Browser interaction data is recorded by jspsych allowing us to determine if subjects violate our instructions not to leave the browser tab (or exit fullscreen mode) during test. These incidents are recorded in jspsych interaction data object, and the number of violations is counted and added to the participant data file.

Due to eccentricity of the browser events captured, 1-2 browser violations can be captured even if the subject did not leave the browser window (eg. in case of resizing window to meet minimum requirements.)

# MANUALLY INSPECT violations: participant count per recorded violation value
df_participants %>%
  group_by(violations) %>%
  dplyr::summarize(n = n())
## # A tibble: 11 × 2
##    violations     n
##         <dbl> <int>
##  1        1      73
##  2        1.5     2
##  3        2      21
##  4        2.5     2
##  5        3      11
##  6        3.5     4
##  7        4       5
##  8        4.5     1
##  9        5.5     1
## 10        6       1
## 11        7       1
# DISCARD participants exceeding the threshold of browser interaction violations
# (strictly greater than violation_threshold; a value equal to it is retained)
exclude_violations <- df_participants %>%
  filter(violations > violation_threshold) %>%
  mutate(reason = "exceeded-violations")

# Record the exclusions, then discard the temporary table
ex_participants <- rbind(ex_participants, exclude_violations)
rm(exclude_violations)

# Keep only participants that have not been excluded so far
df_participants <- filter(
  df_participants,
  !(subject %in% ex_participants$subject)
)

Four participants were excluded for exceeding the maximum allowed number of browser interaction violations.

1.1.5 Effort

To assist in mitigating increased noise in data collected asynchronously from the UCSD student subject pool, we added explicit ratings of how much effort the participant expended on the task. This question was implemented as a multiple-choice drop-down on an ‘Effort’ page prior to the ‘Demographics’ survey at the end of the study. Subjects were given four options : (1) I tried my best on each question, (2) I tried my best on most questions, (3) I started out trying hard, but gave up at some point, (4) I didn’t try very hard, or rushed through the questions.

# MANUALLY INSPECT effort: participant count per self-rated effort response
df_participants %>%
  group_by(effort) %>%
  dplyr::summarize(n = n())
## # A tibble: 3 × 2
##   effort                                                   n
##   <chr>                                                <int>
## 1 I started out trying hard, but gave up at some point     1
## 2 I tried my best on each question                       110
## 3 I tried my best on most questions                        7

Participants answering with the options “I didn’t try very hard, or rushed through the questions” or “I started out trying hard, but gave up at some point” are excluded from analysis.

# DISCARD participants whose self-rated effort response is one of the
# answers listed in effort_exclusion
exclude_effort <- df_participants %>%
  filter(effort %in% effort_exclusion) %>%
  mutate(reason = "selfrated-effort")

# Record the exclusions, then discard the temporary table
ex_participants <- rbind(ex_participants, exclude_effort)
rm(exclude_effort)

# Keep only participants that have not been excluded so far
df_participants <- filter(
  df_participants,
  !(subject %in% ex_participants$subject)
)

One participant is excluded for low (self-rated) effort.

1.1.6 Attention Check

The 6th question in the study is non-discriminatory (can easily get correct answer regardless of strategy) and serves as an attention check question.

# MANUALLY INSPECT attention: participant count per attention-check outcome
df_participants %>%
  group_by(attn_check) %>%
  dplyr::summarize(n = n())
## # A tibble: 2 × 2
##   attn_check     n
##   <fct>      <int>
## 1 FALSE         18
## 2 TRUE          99

Participants who answered the attention check question incorrectly should be excluded.

#DISCARD participants who failed the attention check question
#NOTE: this exclusion is DISABLED (every statement below is commented out), so no participant is removed here
# exclude_attn <- df_participants %>% 
#           filter(attn_check == FALSE) %>% 
#           mutate(reason="failed-attnchk")
# 
# ex_participants <- rbind(ex_participants, exclude_attn)
# rm(exclude_attn)     
# 
# df_participants <- df_participants %>% 
#   filter( ! subject %in% ex_participants$subject)

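No participants are excluded for failing the attention check question: the exclusion code above is commented out, so the 18 participants who answered it incorrectly are retained in the dataset.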

1.1.7 Items

Next, we need to discard item_level data for excluded participants.

# Split item-level rows by whether their subject was excluded above:
# ex_items keeps the excluded participants' rows for the record,
# df_items keeps only rows from retained participants
ex_items <- filter(df_items, subject %in% ex_participants$subject)

df_items <- filter(df_items, !(subject %in% ex_participants$subject))

1.2 Validation

After all exclusions, we are left with the following number of participants per condition:

# MANUALLY INSPECT conditions: remaining participants per condition after exclusions
df_participants %>%
  group_by(condition) %>%
  dplyr::summarize(n = n())
## # A tibble: 2 × 2
##   condition     n
##   <chr>     <int>
## 1 11111112     57
## 2 11311112     60

Finally, we need to validate we have a complete set of items for all valid participants.

# VALIDATE that every retained participant has a complete set of items.
# Comparing total row counts alone can pass when one subject has surplus rows
# and another is short, so three conditions must all hold:
#   1. the total number of item rows equals participants * n_items
#   2. the subjects in df_items are exactly the retained participants
#   3. each subject contributes exactly n_items rows
# as.character() guards against unused factor levels yielding zero counts.
nrow(df_items) == nrow(df_participants) * n_items &&
  setequal(
    as.character(df_items$subject),
    as.character(df_participants$subject)
  ) &&
  all(table(as.character(df_items$subject)) == n_items)
## [1] TRUE

2 Participants Codebook

#see https://cran.r-project.org/web/packages/codebook/vignettes/codebook_tutorial.html

# ADD VARIABLE METADATA
# Import the data dictionary and use its VARIABLE / DESCRIPTION columns
# to add variable labels
dict <- rio::import("input/dictionary_sgc4d_participants.csv", "csv")
var_label(df_participants) <- dict_to_list(select(dict, VARIABLE, DESCRIPTION))

# ADD DATASET METADATA
metadata(df_participants)$name <- "Experimental PARTICIPANTS for study SGC4D"
metadata(df_participants)$description <- "Data for study SGC4D summarized at PARTICIPANT  level"
metadata(df_participants)$creator <- "Amy Rae Fox"
metadata(df_participants)$contact <- "amyraefox@gmail.com"

# {r, eval = checkMode() == "pdf"} # ONLY FOR PDF KNIT
codebook::skim_codebook(df_participants)
Data summary
Name data
Number of rows 117
Number of columns 32
_______________________
Column type frequency:
character 10
factor 13
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
subject 0 1 5 5 0 117 0
condition 0 1 8 8 0 2 0
sona_id 0 1 24 24 0 112 0
effort 0 1 32 33 0 2 0
other 0 1 0 402 59 58 0
country 0 1 2 44 0 11 0
language 0 1 7 7 0 1 0
schoolyear 0 1 7 27 0 7 0
disability 0 1 0 72 46 25 0
starttime 0 1 24 24 0 117 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
study 0 1 FALSE 1 SGC: 117
pretty_condition 0 1 FALSE 2 TRI: 60, ORT: 57
session 0 1 FALSE 1 suP: 117
exp_id 0 1 FALSE 4 630: 40, 630: 37, 630: 20, 630: 20
pool 0 1 FALSE 1 pro: 117
mode 0 1 FALSE 1 asy: 117
attn_check 0 1 FALSE 2 TRU: 99, FAL: 18
major 0 1 FALSE 7 Mat: 32, Soc: 26, Fin: 20, Hum: 18
gender 0 1 FALSE 3 Mal: 59, Fem: 51, Oth: 7
browser 0 1 FALSE 1 chr: 117
os 0 1 FALSE 5 Win: 82, Mac: 27, Chr: 4, Win: 2
status 0 1 FALSE 1 suc: 117
term 0 1 FALSE 1 sum: 117

Variable type: numeric

skim_variable n_missing complete_rate mean sd min median max hist
difficulty 0 1 3.54 0.98 1.00 4.00 5.00 ▁▂▅▇▃
confidence 0 1 3.38 1.06 1.00 3.00 5.00 ▁▃▇▇▃
enjoyment 0 1 3.45 1.20 1.00 4.00 5.00 ▂▆▇▇▇
age 0 1 34.63 11.83 19.00 32.00 71.00 ▇▆▃▂▁
width 0 1 1642.15 303.85 1143.00 1536.00 2752.00 ▇▇▆▁▁
height 0 1 843.73 145.20 685.00 785.00 1329.00 ▇▃▃▁▁
violations 0 1 1.62 0.91 1.00 1.00 4.00 ▇▂▁▁▁
absolute_score 0 1 1.79 3.46 0.00 0.00 12.00 ▇▁▁▁▁
totaltime_m 0 1 12.47 6.45 2.15 11.31 38.43 ▇▇▂▁▁
# ONLY FOR HTML KNIT
# Only the metadata table is requested; every other option is set to FALSE
codebook(
  df_participants,
  metadata_table = TRUE,
  detailed_variables = FALSE,
  detailed_scales = FALSE,
  metadata_json = FALSE,
  survey_overview = FALSE,
  missingness_report = FALSE
)

2.0.1 Metadata

2.0.1.1 Description

Dataset name: Experimental PARTICIPANTS for study SGC4D

Data for study SGC4D summarized at PARTICIPANT level

Metadata for search engines
  • Date published: 2022-08-26

  • Creator:

name value
1 Amy Rae Fox
x
x
subject
study
condition
pretty_condition
session
exp_id
sona_id
pool
mode
attn_check
effort
difficulty
confidence
enjoyment
other
age
country
language
schoolyear
major
gender
disability
browser
width
height
os
starttime
status
term
violations
absolute_score
totaltime_m

2.1 Codebook table

3 Items Codebook

#see https://cran.r-project.org/web/packages/codebook/vignettes/codebook_tutorial.html

# ADD VARIABLE METADATA
# Import the data dictionary and use its VARIABLE / DESCRIPTION columns
# to add variable labels
dict <- rio::import("input/dictionary_sgc4d_items.csv", "csv")

var_label(df_items) <- dict_to_list(select(dict, VARIABLE, DESCRIPTION))

# ADD DATASET METADATA
metadata(df_items)$name <- "Experimental ITEMS for study SGC4D"
metadata(df_items)$description <- "Data for study SGC4D summarized at participant-item level"
metadata(df_items)$creator <- "Amy Rae Fox"
metadata(df_items)$contact <- "amyraefox@gmail.com"

# {r, eval = checkMode() == "pdf"} # ONLY FOR PDF EXPORT
skim_codebook(df_items)
Data summary
Name data
Number of rows 1755
Number of columns 23
_______________________
Column type frequency:
character 11
factor 8
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
subject 0 1 5 5 0 117 0
study 0 1 5 5 0 1 0
condition 0 1 8 8 0 2 0
explicit 0 1 1 1 0 1 0
impasse 0 1 1 1 0 1 0
grid 0 1 1 1 0 2 0
mark 0 1 1 1 0 1 0
ixn 0 1 1 1 0 1 0
graph 0 1 10 10 0 1 0
question 0 1 26 87 0 15 0
answer 0 1 0 25 42 94 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
term 0 1 FALSE 1 sum: 1755
pool 0 1 FALSE 1 pro: 1755
mode 0 1 FALSE 1 asy: 1755
pretty_condition 0 1 FALSE 2 TRI: 900, ORT: 855
block 0 1 FALSE 3 ite: 819, ite: 585, ite: 351
relation 0 1 FALSE 10 end: 234, mee: 234, mid: 234, sta: 234
q 0 1 FALSE 15 1: 117, 2: 117, 3: 117, 4: 117
correct 0 1 FALSE 2 FAL: 1340, TRU: 415

Variable type: numeric

skim_variable n_missing complete_rate mean sd min median max hist
gwidth 0 1 600.00 0.00 600.00 600.00 600.00 ▁▁▇▁▁
gheight 0 1 600.00 0.00 600.00 600.00 600.00 ▁▁▇▁▁
time_elapsed_m 0 1 6.88 5.22 0.35 5.71 37.39 ▇▃▁▁▁
rt_s 0 1 34.79 38.23 0.14 22.03 425.35 ▇▁▁▁▁
# ONLY FOR HTML EXPORT
# Only the metadata table is requested; every other option is set to FALSE
codebook(
  df_items,
  metadata_table = TRUE,
  detailed_variables = FALSE,
  detailed_scales = FALSE,
  metadata_json = FALSE,
  survey_overview = FALSE,
  missingness_report = FALSE
)

3.0.1 Metadata

3.0.1.1 Description

Dataset name: Experimental ITEMS for study SGC4D

Data for study SGC4D summarized at participant-item level

Metadata for search engines
  • Date published: 2022-08-26

  • Creator:

name value
1 Amy Rae Fox
x
x
subject
study
term
pool
mode
condition
pretty_condition
block
explicit
impasse
grid
mark
ixn
gwidth
gheight
graph
time_elapsed_m
question
relation
q
correct
answer
rt_s

3.1 Codebook table

4 Explore

Exploration of the distribution of key response variables for validation purposes:

# Exploratory plots of the key response variables. Each measure is shown
# overall and then faceted by condition.

# Participant-level absolute score
gf_histogram(~absolute_score, data = df_participants) +
  labs(title = "SGC4D Distribution of Absolute Score")

gf_dhistogram(~absolute_score, data = df_participants) %>%
  gf_facet_wrap(~pretty_condition) +
  labs(title = "SGC4D Distribution of Absolute Score (by Condition)")

# Item-level correctness
gf_props(~correct, data = df_items) +
  labs(title = "SGC4D Distribution of Item Absolute Score")

gf_props(~correct, data = df_items) %>%
  gf_facet_wrap(~pretty_condition) +
  labs(title = "SGC4D Distribution of Item Absolute Score (by Condition)")

# Participant-level total study time (minutes)
gf_histogram(~totaltime_m, data = df_participants) +
  labs(title = "SGC4D Distribution of Total Study Time")

# This plot previously repeated the absolute-score histogram by condition;
# following the overall/by-condition pattern it shows total study time.
gf_histogram(~totaltime_m, data = df_participants) %>%
  gf_facet_wrap(~pretty_condition) +
  labs(title = "SGC4D Distribution of Total Study Time (by Condition)")

# Item-level response time (seconds)
gf_histogram(~rt_s, data = df_items) +
  labs(title = "SGC4D Distribution of Item Response Time")

# Participant-level total time against absolute score; the title names the
# variables actually plotted (these are not item-level response times)
gf_jitter(totaltime_m ~ absolute_score, data = df_participants) +
  labs(title = "SGC4D Total Study Time vs Absolute Score")

library(ggstatsplot)
## You can cite this package as:
##      Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
##      Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
# Compare absolute score between conditions (type = "nonparametric")
ggbetweenstats(
  data = df_participants,
  x = pretty_condition,
  y = absolute_score,
  type = "nonparametric"
)

5 Data Export

5.1 Save Exclusions

For transparency, we save and identify the excluded data.

# Write the excluded participants and their item-level rows to CSV
write.csv(
  ex_participants,
  file = "output/excluded_participants_summer22_sgc4d.csv",
  row.names = FALSE
)
write.csv(
  ex_items,
  file = "output/excluded_items_summer22_sgc4d.csv",
  row.names = FALSE
)

5.2 Analysis-Ready Files

# CSV files
write.csv(
  df_participants,
  file = "output/sgc4d_participants.csv",
  row.names = FALSE
)
write.csv(
  df_items,
  file = "output/sgc4d_items.csv",
  row.names = FALSE
)

# Export R DATA STRUCTURES (include codebook metadata) as .rds files
rio::export(df_participants, "output/sgc4d_participants.rds")
rio::export(df_items, "output/sgc4d_items.rds")